library(ggplot2)
library(dplyr)
library(reshape2)
library(fst)
library(haven)

# Education-level margins per election year ----
#
# For every election year, count per education level (1-7) how many people
# are in the population file, how many of them ran, and how many of them were
# elected. Local elections (kv) are collected in `bar_data`, parliamentary
# elections (fv) in `bar_data_fv`; both have one row per year and level.

# Translate a vector of `education` labels (character or factor) into the
# ordinal levels 1-7 used on the x axis. Labels that are not listed, and
# missing values, become NA.
# NOTE(review): the "?" inside four of the labels looks like a non-ASCII
# letter that was lost in an encoding conversion. The strings are kept exactly
# as they were; confirm that the udda_recoded files contain the same text,
# otherwise those levels silently turn into NA.
recode_edu_level <- function(education) {
  level_by_label <- c(
    "Grundskole"                              = 1,
    "Erhvervsfaglige praktik- og hovedforl?b" = 2,
    "Almengymnasiale uddannelser"             = 3,
    "Erhvervsgymnasiale uddannelser"          = 3,
    "Korte videreg?ende uddannelser"          = 4,
    "Mellemlange videreg?ende uddannelser"    = 5,
    "Bachelor"                                = 5,
    "Lange videreg?ende uddannelser"          = 6,
    "Forskeruddannelser"                      = 7
  )
  # as.character() so that a factor is looked up by label, not by integer code
  unname(level_by_label[as.character(education)])
}

# Count the rows of `data` per `edu_level` and return them as a two-column
# table whose count column is called `count_name`.
count_by_edu_level <- function(data, count_name) {
  counts <-
    data %>%
    group_by(edu_level) %>%
    summarise(count = n())
  names(counts)[names(counts) == "count"] <- count_name
  counts
}

# Local elections ----

bar_data <- tibble()

for (k in seq(1993, 2013, by = 4)) {

  # population file for year k with the recoded education level
  edu <-
    read.fst(paste0("udda_recoded", k, ".fst")) %>%
    mutate(edu_level = recode_edu_level(education))

  # one row per candidate; `raw_data` must be defined before this point
  candidates_data <-
    read_sas(paste0(raw_data,
                    "grunddata/valgdata/",
                    "kv", k, "_recodes_pnr_afid.sas7bdat")) %>%
    mutate(elected_kv = VALGT_JN == "J",
           run_kv     = 1) %>%
    select(PNR, elected_kv, run_kv, PARTI)

  # People who did not run get NA in run_kv / elected_kv.
  # NOTE(review): the suppressed warnings are presumably about differing
  # column attributes between the fst and SAS inputs - confirm and narrow.
  suppressWarnings({
    data_an <-
      left_join(edu, candidates_data, by = "PNR")
  })

  # Counts per level for population, running candidates and elected
  # candidates. Levels without any rows come out of the joins as NA, so all
  # three margins are zero-filled after the joins (filling before the join
  # cannot have any effect because n() is never NA).
  plot_data <-
    tibble(edu_level = 1:7,
           year      = k) %>%
    left_join(count_by_edu_level(data_an, "population_margin"),
              by = "edu_level") %>%
    left_join(count_by_edu_level(filter(data_an, run_kv == 1),
                                 "candidate_margin"),
              by = "edu_level") %>%
    left_join(count_by_edu_level(filter(data_an, elected_kv),
                                 "elected_margin"),
              by = "edu_level") %>%
    mutate(population_margin = coalesce(population_margin, 0L),
           candidate_margin  = coalesce(candidate_margin, 0L),
           elected_margin    = coalesce(elected_margin, 0L))

  bar_data <- bind_rows(bar_data, plot_data)

  print(k)  # progress
}

# Parliamentary elections ----

bar_data_fv <- tibble()

for (k in c(1990, 1994, 1998, 2001, 2005, 2007, 2011, 2015)) {

  # population file for year k with the recoded education level
  # (insert `sample_n(1000000) %>%` before mutate() for quick test runs)
  edu <-
    read.fst(paste0("udda_recoded", k, ".fst")) %>%
    mutate(edu_level = recode_edu_level(education))

  # NOTE(review): unlike the kv files, this path has no `raw_data` prefix and
  # is resolved against the working directory - confirm this is intended.
  candidates_data <-
    read_sas(paste0("fv", k, "_recodes_pnr_afid.sas7bdat")) %>%
    mutate(elected_fv = VALGT_JN == "J",
           run_fv     = 1) %>%
    select(PNR, elected_fv, run_fv, PARTI)

  # People who did not run get NA in run_fv / elected_fv.
  suppressWarnings({
    data_an <-
      left_join(edu, candidates_data, by = "PNR")
  })

  # Same three margins as for the local elections, zero-filled after the
  # joins so that empty levels count as 0 rather than NA.
  plot_data <-
    tibble(edu_level = 1:7,
           year      = k) %>%
    left_join(count_by_edu_level(data_an, "population_margin"),
              by = "edu_level") %>%
    left_join(count_by_edu_level(filter(data_an, run_fv == 1),
                                 "parl_candidate_margin"),
              by = "edu_level") %>%
    left_join(count_by_edu_level(filter(data_an, elected_fv),
                                 "parl_elected_margin"),
              by = "edu_level") %>%
    mutate(population_margin     = coalesce(population_margin, 0L),
           parl_candidate_margin = coalesce(parl_candidate_margin, 0L),
           parl_elected_margin   = coalesce(parl_elected_margin, 0L))

  bar_data_fv <- bind_rows(bar_data_fv, plot_data)

  print(k)  # progress
}

# Pool the yearly margins over all elections ----

# Reshape a per-year margin table (columns edu_level, year, <margins...>) to
# long format and sum each margin over the years. Returns an ungrouped table
# with columns edu_level, variable (margin name, character) and count.
sum_margins_over_years <- function(yearly_margins) {
  melt(yearly_margins,
       id.vars = c("edu_level", "year")) %>%
    # melt() returns a factor; a plain character column lets the kv and fv
    # tables be row-bound without any factor-level coercion warning
    mutate(variable = as.character(variable)) %>%
    group_by(edu_level, variable) %>%
    summarise(count = sum(value)) %>%
    ungroup()
}

bar_data <-
  as.data.frame(bar_data)

bar_data2 <-
  sum_margins_over_years(bar_data)

bar_data_fv <-
  as.data.frame(bar_data_fv)

# Years with both a local and a parliamentary election (2001 and 2005 with
# the current year lists) already contribute their population through
# bar_data, so their parliamentary population margin is set to 0 in order not
# to count those people twice.
bar_data2_fv <-
  bar_data_fv %>%
  mutate(population_margin = ifelse(year %in% bar_data$year,
                                    0,
                                    population_margin)) %>%
  sum_margins_over_years()

# population_margin occurs in both tables and is added up here; the four
# candidate / elected margins each occur in only one of them
bar_data2 <-
  bind_rows(bar_data2, bar_data2_fv) %>%
  group_by(edu_level, variable) %>%
  summarise(count = sum(count)) %>%
  ungroup()

# total per margin type, used as the denominator of the shares
bar_data_count <-
  bar_data2 %>%
  group_by(variable) %>%
  summarise(total = sum(count))

# share of each education level within its margin type
bar_data2 <-
  left_join(bar_data2, bar_data_count, by = "variable") %>%
  mutate(share = count / total)


# Bar chart of education shares ----
#
# Dodged bars show, per education level, the share of candidates / elected
# politicians with that level (one alpha shade per margin type). The outlined,
# nearly transparent bar drawn on top shows the population share.
plot <-
  ggplot(data = bar_data2 %>% filter(variable != "population_margin"),
         aes(x = edu_level,
             y = share,
             alpha = variable)) +
  geom_bar(stat = "identity", position = "dodge") +
  # alpha is set to a constant here, which overrides the inherited mapping,
  # so the population bars do not appear in the legend
  geom_bar(data = bar_data2 %>% filter(variable == "population_margin"),
           aes(x = edu_level,
               y = share),
           stat = "identity", fill = "white", colour = "black", alpha = 1/10) +
  theme_classic() +
  # breaks are listed explicitly so that each label is tied to its variable
  # instead of depending on the sort order of `variable`
  scale_alpha_discrete(name = "",
                       range = c(0.2, 1),
                       breaks = c("candidate_margin",
                                  "elected_margin",
                                  "parl_candidate_margin",
                                  "parl_elected_margin"),
                       labels = c("Running for municipality",
                                  "Elected for municipality",
                                  "Running for parliament",
                                  "Elected for parliament")) +
  scale_x_continuous("", breaks = 1:7,
                     labels = c("Primary \nschool",
                                "Vocational \ntraining",
                                "High \nschool",
                                "Short \ntertiary",
                                "Medium length \ntertiary",
                                "Long \ntertiary",
                                "PhD")) +
  scale_y_continuous("")
